Manifold Learning Experiments
Contents
Manifold Learning Experiments
1) Loading Data
import numpy as np
import pandas as pd
import seaborn as sns
import deciphering_enigma
import matplotlib.pyplot as plt
# Location of the YAML file that describes this experiment
path_to_config = './config.yaml'
# Parse the experiment configuration
exp_config = deciphering_enigma.load_yaml_config(path_to_config)
dataset_path = exp_config.dataset_path
# Register the experiment directory, collect the wav file paths and keep
# only the recordings whose name ends with 'mic1_normloud.wav'
audio_files = [
    wav_path
    for wav_path in deciphering_enigma.build_experiment(exp_config)
    if wav_path.endswith('mic1_normloud.wav')
]
print(f'Dataset has {len(audio_files)} samples')
Dataset has 44455 samples
# Derive per-file metadata from the file naming convention
metadata_df, audio_format = deciphering_enigma.extract_metadata(exp_config, audio_files)
# The 'xx' and 'Label' columns are removed in place
metadata_df.drop(['xx', 'Label'], axis=1, inplace=True)
# Load the audio files as torch tensors, ready for feature extraction
audio_tensor_list = deciphering_enigma.load_dataset(
    audio_files,
    cfg=exp_config,
    speaker_ids=metadata_df['ID'],
    audio_format=audio_format,
)
Audio Tensors are already saved for vctk_umap_experiment
import soundfile as sf
from tqdm import tqdm
# Duration in seconds of every file in audio_files (same order).
# Only the file header is inspected via sf.info, so the samples themselves
# are never decoded; frames / samplerate equals len(audio) / sr of a full read.
dur = []
for file in tqdm(audio_files):
    info = sf.info(file)
    dur.append(info.frames / info.samplerate)
100%|████████████████████████████████████| 44455/44455 [05:10<00:00, 143.28it/s]
2) Generating Embeddings
# Compute speech embeddings for the loaded audio tensors; the extractor is
# driven by the experiment configuration
feature_extractor = deciphering_enigma.FeatureExtractor()
embeddings_dict = feature_extractor.extract(audio_tensor_list, exp_config)
Load TERA Model
TERA embeddings are already saved for vctk_umap_experiment
(44455, 768)
import matplotlib
from pylab import cm
import matplotlib as mpl
# Ask matplotlib to rebuild its font cache. `_rebuild` is a private helper
# that is not available in every matplotlib release, so it is only invoked
# when present instead of aborting the notebook with an AttributeError.
_rebuild_font_cache = getattr(matplotlib.font_manager, '_rebuild', None)
if callable(_rebuild_font_cache):
    _rebuild_font_cache()
# Global plot styling: serif font and thick axes borders
mpl.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.linewidth'] = 3
3) Unsupervised Dimensionality Reduction
import os
import numpy as np
import pandas as pd
import scipy
from scipy.spatial.distance import pdist
from umap import UMAP
from pacmap import PaCMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from deciphering_enigma.settings import _hyperparams_grid_reducers, _optimize_function, _knn, _subsetsize
class ReducerTuner():
    """Tuner for dimensionality reduction methods.

    Implements grid-search across hyperparameters for each dimensionality
    reduction method preset in the settings script.
    NOTE: any method added in the settings script should follow sklearn
    implementation.
    Tunes reduced dimensions by optimizing local and global structure metrics.
    Saves tuned results for each method as a pandas dataframe.
    """

    def __init__(self):
        # Search space, objective and metric settings come from the settings module.
        self.reducer_params_grid = _hyperparams_grid_reducers
        self.optimize_func = _optimize_function
        self.knn = _knn
        self.subsetsize = _subsetsize

    def embedding_quality(self, X, Z, knn=10, subsetsize=1000):
        """Score how well the reduced data ``Z`` preserves the structure of ``X``.

        Args:
            X: original data, indexable as ``X[rows, :]`` (e.g. a 2-D ndarray).
            Z: reduced data with one row per row of ``X``.
            knn: number of neighbours compared per point for the local score.
            subsetsize: maximum number of points sampled for the global score.

        Returns:
            ``(mnn, rho)`` where ``mnn`` is the mean fraction of each point's
            ``knn`` nearest neighbours shared between ``X`` and ``Z`` and
            ``rho`` is the Spearman correlation between the pairwise distances
            of a random subset of points measured in ``X`` and in ``Z``.
        """
        # Imported here so the method does not rely on `scipy.stats` having
        # been loaded as a side effect of another import.
        from scipy.stats import spearmanr

        n_samples = X.shape[0]

        # Local structure: overlap of the k-nearest-neighbour sets.
        neighbours_x = NearestNeighbors(n_neighbors=knn).fit(X).kneighbors(return_distance=False)
        neighbours_z = NearestNeighbors(n_neighbors=knn).fit(Z).kneighbors(return_distance=False)
        shared = sum(len(set(nx) & set(nz)) for nx, nz in zip(neighbours_x, neighbours_z))
        mnn = shared / n_samples / knn

        # Global structure: rank correlation of pairwise distances. Sampling
        # without replacement avoids duplicated points (zero distances in both
        # spaces) and the size is capped so small datasets do not fail.
        # The draw uses numpy's global RNG, so rho varies between runs unless
        # the caller seeds it.
        subset = np.random.choice(n_samples, size=min(subsetsize, n_samples), replace=False)
        rho = spearmanr(pdist(X[subset, :]), pdist(Z[subset, :])).correlation
        return (mnn, rho)

    def get_reducer(self, name):
        """Return the reducer class registered under ``name``.

        Raises:
            AttributeError: if ``name`` is not one of 'PCA', 'tSNE', 'UMAP'
                or 'PaCMAP'.
        """
        if name == 'PCA':
            return PCA
        if name == 'tSNE':
            return TSNE
        if name == 'UMAP':
            return UMAP
        if name == 'PaCMAP':
            return PaCMAP
        raise AttributeError(f'This reducer {name} is not included...')

    def fit_eval(self, embeddings, reducer):
        """Standardize ``embeddings``, reduce them and score the result.

        Returns:
            ``(reduced_embeddings, local_val, global_val)``; both scores are
            computed against the standardized embeddings.
        """
        stand_embeddings = StandardScaler().fit_transform(embeddings)
        reduced_embeddings = reducer.fit_transform(stand_embeddings)
        local_val, global_val = self.embedding_quality(
            stand_embeddings, reduced_embeddings, knn=self.knn, subsetsize=self.subsetsize
        )
        return reduced_embeddings, local_val, global_val

    def save_results_pandas(self, reducers_embeddings_dict, metadata=None, model_name=None, dataset_name=None):
        """Write the tuned 3-D embeddings (plus metadata) to a CSV file.

        The file is ``../{dataset_name}/{model_name}/dim_reduction_3d.csv``.
        Columns form a three-level header (method, optimized metric, dim),
        followed by the metadata columns whose lower levels are blank.

        Args:
            reducers_embeddings_dict: ``{method: {'Local': arr, 'Global': arr}}``
                where each array has at least three columns.
            metadata: optional DataFrame appended column-wise (aligned on the
                index); skipped when ``None``.
            model_name: used to build the output path.
            dataset_name: used to build the output path.
        """
        save_path = f'../{dataset_name}/{model_name}/dim_reduction_3d.csv'
        metric_names = ['Local', 'Global']
        dim_names = ['Dim1', 'Dim2', 'Dim3']
        combined_column_obj = pd.MultiIndex.from_product(
            [list(reducers_embeddings_dict.keys()), metric_names, dim_names],
            names=["Method", "Optimized Metric", "Dim"],
        )

        # Collect the columns in the same (method, metric, dim) order as the
        # product index above, then label them with it.
        data = {}
        for name, tuned in reducers_embeddings_dict.items():
            for metric in metric_names:
                reduced = tuned[metric]
                for axis_idx, dim in enumerate(dim_names):
                    data[(name, metric, dim)] = reduced[:, axis_idx]
        df = pd.DataFrame(data)
        df.columns = combined_column_obj

        if metadata is not None:
            temp_df = metadata.copy()
            # Pad metadata column names to three levels so they fit the header.
            temp_df.columns = pd.MultiIndex.from_tuples([(col, '', '') for col in temp_df.columns])
            df = pd.concat([df, temp_df], axis=1)

        # Make sure the target directory exists before writing.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_csv(save_path)

    def tune_reducer(self, embeddings, metadata=None, dataset_name=None, model_name=None, save_results = True, save_path='./'):
        """Grid-search every configured reducer and keep the best embeddings.

        For each reducer in the settings grid, every hyperparameter combination
        is fitted with ``n_components=3`` and ``random_state=42``; the
        embeddings with the highest local score and those with the highest
        global score are kept. Nothing is computed when
        ``../{dataset_name}/{model_name}/dim_reduction_3d.csv`` already exists.

        Args:
            embeddings: data passed to ``fit_eval``.
            metadata: forwarded to ``save_results_pandas``.
            dataset_name: used to build the results path.
            model_name: used to build the results path.
            save_results: whether to write the tuned embeddings to disk.
            save_path: accepted for backward compatibility; currently unused.

        Returns:
            ``{reducer_name: {'Local': best_local, 'Global': best_global}}``;
            empty when cached results were found.
        """
        reducers_embeddings_dict = {}
        metrics_dict = {}
        df_path = f'../{dataset_name}/{model_name}/dim_reduction_3d.csv'
        if os.path.isfile(df_path):
            print(f'Tuned Reduced Embeddings already saved for {model_name} model!')
            # Return before saving so existing results are never overwritten.
            return metrics_dict

        n_reducers = len(self.reducer_params_grid)
        for i, (reducer_name, reducer_params) in enumerate(self.reducer_params_grid.items()):
            print(f'Reducer {i+1}/{n_reducers}: {reducer_name}...')
            reducer_object = self.get_reducer(reducer_name)
            all_embeddings = []
            local_metrics = []
            global_metrics = []
            for params in ParameterGrid(reducer_params):
                print(params)
                reducer = reducer_object(n_components=3, random_state=42, **params)
                reduced_embeddings, local_metric, global_metric = self.fit_eval(embeddings, reducer)
                all_embeddings.append(reduced_embeddings)
                local_metrics.append(local_metric)
                global_metrics.append(global_metric)
            max_local_idx = np.argmax(local_metrics)
            max_global_idx = np.argmax(global_metrics)
            metrics_dict[reducer_name] = {'Local': np.max(local_metrics), 'Global': np.max(global_metrics)}
            reducers_embeddings_dict[reducer_name] = {
                'Local': all_embeddings[max_local_idx],
                'Global': all_embeddings[max_global_idx],
            }

        if save_results:
            self.save_results_pandas(reducers_embeddings_dict, metadata, model_name, dataset_name)
        return metrics_dict
# Tune the dimensionality reducers once per embedding model.
# NOTE: embeddings_dict must already exist (run the embedding generation
# step first), otherwise this raises NameError.
tuner = deciphering_enigma.ReducerTuner()
for model_name, model_embeddings in embeddings_dict.items():
    print(f'{model_name}:')
    tuner.tune_reducer(model_embeddings, metadata=metadata_df, dataset_name=exp_config.dataset_name, model_name=model_name)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [7], in <cell line: 2>()
1 tuner = deciphering_enigma.ReducerTuner()
----> 2 for i, model_name in enumerate(embeddings_dict.keys()):
3 print(f'{model_name}:')
4 tuner.tune_reducer(embeddings_dict[model_name], metadata=metadata_df, dataset_name=exp_config.dataset_name, model_name=model_name)
NameError: name 'embeddings_dict' is not defined
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots
def visualize_embeddings(df, label_name, metrics=[], axis=[], acoustic_param={}, opt_structure='Local', red_name='PCA', row=1, col=1, hovertext='', label=''):
    """Add a 2-D scatter of reduced embeddings to a cell of a subplot figure.

    The points are the 'Dim1'/'Dim2' columns of ``df`` stored under
    ``(red_name, opt_structure)``, coloured by ``df[label_name]`` and showing
    ``df['AudioNames']`` on hover. The resulting traces are added to ``axis``
    at position (``row``, ``col``). ``axis`` must provide ``add_traces``.
    ``metrics``, ``acoustic_param``, ``hovertext`` and ``label`` are not used.
    Nothing is returned.
    """
    x_values = df[red_name, opt_structure, 'Dim1']
    y_values = df[red_name, opt_structure, 'Dim2']
    scatter_fig = px.scatter(x=x_values, y=y_values, color=df[label_name], hover_name=df['AudioNames'])
    scatter_fig.layout.update(showlegend=False)
    scatter_traces = list(scatter_fig.select_traces())
    axis.add_traces(scatter_traces, rows=row, cols=col)
optimize = 'Global'
label = 'ID'

# When the CSV is read back with a three-row header, the blank lower header
# levels of the metadata columns show up under these placeholder names;
# they are mapped back to ''.
_blank_header_levels = {
    'Unnamed: 5_level_1': '', 'Unnamed: 5_level_2': '',
    'Unnamed: 6_level_1': '', 'Unnamed: 6_level_2': '',
}

# Colour columns that are not stored in the CSV and are derived before plotting.
_derived_columns = {
    # Second '_'-separated token of the audio file name
    'Scentence': lambda frame: frame['AudioNames'].apply(lambda x: x.split('_')[1]),
    # NOTE(review): assumes `dur` is ordered like the rows of the CSV — confirm.
    'Duration': lambda frame: np.log(dur),
}

# (rows, cols, width, height) of the subplot figure. With the wide layout only
# the first cell receives traces (visualize_embeddings defaults to row=1, col=1).
_SINGLE = (1, 1, 1000, 800)
_WIDE = (2, 4, 4000, 2000)

# One entry per figure, in display order: (model name, colour column, *layout)
_plot_specs = [
    ('Log-Mel-Spectrogram', label, *_SINGLE),
    ('BYOL-A_default', label, *_SINGLE),
    ('BYOL-I_default', label, *_SINGLE),
    ('BYOL-S_default', label, *_SINGLE),
    ('BYOL-S_cvt', label, *_SINGLE),
    ('Hybrid_BYOL-S_cvt', label, *_SINGLE),
    ('APC', label, *_SINGLE),
    ('TERA', label, *_SINGLE),
    ('Wav2Vec2_latent', label, *_WIDE),
    ('Wav2Vec2', label, *_WIDE),
    ('HuBERT_latent', label, *_SINGLE),
    ('HuBERT_best', label, *_SINGLE),
    ('HuBERT', label, *_SINGLE),
    ('HuBERT', 'Scentence', *_SINGLE),
    ('HuBERT', 'Duration', *_SINGLE),
    ('Data2Vec_latent', label, *_SINGLE),
    ('Data2Vec', label, *_WIDE),
]

for model_name, _color_by, _n_rows, _n_cols, _width, _height in _plot_specs:
    fig = make_subplots(rows=_n_rows, cols=_n_cols)
    df = pd.read_csv(f'../{exp_config.dataset_name}/{model_name}/dim_reduction.csv', header=[0,1,2])
    df.rename(columns=_blank_header_levels, inplace=True)
    if _color_by in _derived_columns:
        df[_color_by] = _derived_columns[_color_by](df)
    visualize_embeddings(df, _color_by, metrics=[], axis=fig, opt_structure=optimize, red_name='UMAP')
    fig.update_layout(
        autosize=False,
        width=_width,
        height=_height,
        showlegend=False,
    )
    fig.show()
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots
def visualize_3d_embeddings(df, label_name, metrics=[], axis=[], acoustic_param={}, opt_structure='Local', red_name='PCA', row=1, col=1, hovertext='', label=''):
    """Build a 3-D scatter figure of reduced embeddings.

    The points are the 'Dim1'/'Dim2'/'Dim3' columns of ``df`` stored under
    ``(red_name, opt_structure)``, coloured by ``df[label_name]`` and showing
    ``df['AudioNames']`` on hover; the legend is hidden.
    ``metrics``, ``axis``, ``acoustic_param``, ``row``, ``col``, ``hovertext``
    and ``label`` are accepted for signature compatibility but not used.

    Returns:
        The plotly figure. It used to be built and then discarded, which left
        the function without any observable effect.
    """
    traces = px.scatter_3d(
        x=df[red_name, opt_structure, 'Dim1'],
        y=df[red_name, opt_structure, 'Dim2'],
        z=df[red_name, opt_structure, 'Dim3'],
        color=df[label_name],
        hover_name=df['AudioNames'],
    )
    traces.layout.update(showlegend=False)
    return traces
# 3-D UMAP view (optimized for global structure) of the TERA embeddings,
# coloured by the column named in `label`
model_name = 'TERA'
df = pd.read_csv(
    f'../{exp_config.dataset_name}/{model_name}/dim_reduction_3d.csv',
    header=[0,1,2],
)
# Map the placeholder names of the blank lower header levels back to ''
df.rename(
    columns={
        'Unnamed: 7_level_1': '',
        'Unnamed: 7_level_2': '',
        'Unnamed: 8_level_1': '',
        'Unnamed: 8_level_2': '',
    },
    inplace=True,
)
fig = px.scatter_3d(
    x=df['UMAP', 'Global', 'Dim1'],
    y=df['UMAP', 'Global', 'Dim2'],
    z=df['UMAP', 'Global', 'Dim3'],
    color=df[label],
    hover_name=df['AudioNames'],
)
fig.update_layout(autosize=False, width=1000, height=1000, showlegend=False)
fig.show()